The goal of this document is to look at residuals and see if we could use the residuals to look at some sort of adjusted “counts” (residuals can be negative, so we should probably find another name than “counts”). First, I fitted zinbwave without the batches in the X, then I included the batches in the X. For each scenario, I looked at naive, naive on the log scale, Pearson, and deviance residuals. I plotted the PCA of the residuals with two coloring (batch and clusters) and boxplots of the residuals for each cell.
I am still using the data Davide gave me a while ago. And I think now it is not the same data Davide is using in its most recent vignette (that is zinb_clustering_over_k.Rmd). It should not be very important but it would be nice if we have a common dataset we can work on.
load("../data/Expt4c2b_filtdata.Rda")
load("../data/E4c2b_slingshot_wsforkelly.RData")
de = read.csv('../data/oe_markers.txt', stringsAsFactors = F, header = F)
de = de$V1
Here we only look at the 1000 most variable genes.
names(batch) <- colnames(counts)
counts <- counts[,names(clus.labels)]
batch <- droplevels(batch[names(clus.labels)])
qc <- qc[names(clus.labels),]
vars <- rowVars(log1p(counts))
names(vars) <- rownames(counts)
vars <- sort(vars, decreasing = TRUE)
core <- counts[names(vars)[1:1000],]
We have 616 cells. To speed up the computations, we will focus on the top 1,000 most variable genes.
dim(core)
## [1] 1000 616
core[1:3, 1:3]
## OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## Gstm1 8763 7221 3581
## Cbr2 5799 3638 1448
## Cyp2f2 2158 2027 1078
Cells have been processed in 18 different batches
table(batch)
## batch
## GBC08A GBC08B GBC09A GBC09B P01 P02 P03A P03B P04 P05
## 36 33 30 17 25 43 37 33 14 20
## P06 P10 P11 P12 P13 P14 Y01 Y04
## 39 36 44 40 47 39 51 32
Cells have been clustered into 13 different clusters
table(clus.labels)
## clus.labels
## 1 2 3 4 5 7 8 9 10 11 12 14 15
## 91 25 56 40 96 60 28 79 26 22 35 26 32
Batches are kind of confounded with the biology
table(data.frame(batch = as.vector(batch),
cluster = clus.labels))
## cluster
## batch 1 2 3 4 5 7 8 9 10 11 12 14 15
## GBC08A 0 2 12 9 0 0 0 0 0 2 0 2 9
## GBC08B 0 7 5 3 0 0 0 1 2 4 0 5 6
## GBC09A 0 1 5 9 0 0 0 1 1 0 0 6 7
## GBC09B 0 2 2 7 0 0 0 3 0 0 0 3 0
## P01 0 2 4 3 15 1 0 0 0 0 0 0 0
## P02 2 0 9 3 15 3 3 2 3 0 2 1 0
## P03A 3 0 3 0 12 2 9 4 2 0 2 0 0
## P03B 1 2 1 1 11 1 2 10 1 1 2 0 0
## P04 0 0 0 0 11 1 0 1 1 0 0 0 0
## P05 0 0 0 1 11 3 0 1 0 2 2 0 0
## P06 1 2 3 0 8 2 4 8 4 1 2 2 2
## P10 3 1 4 0 4 5 9 2 0 2 5 0 1
## P11 2 1 1 0 1 5 1 22 3 1 6 0 1
## P12 0 2 0 0 4 10 0 8 2 3 6 4 1
## P13 1 2 4 0 4 15 0 4 5 6 1 3 2
## P14 0 0 1 2 0 12 0 12 2 0 7 0 3
## Y01 47 1 1 2 0 0 0 0 0 0 0 0 0
## Y04 31 0 1 0 0 0 0 0 0 0 0 0 0
plotBoxplot <- function(y, main = '', col, log = F){
if (log) y = log2(y + 1)
boxplot(y, main=main, col=col, staplewex=0, outline=0,
border=col, xaxt='n')
abline(h=0)
}
compute.naive.residuals <- function(Y, zinb){
mu_hat = t(getMu(zinb))
pi_hat = t(getPi(zinb))
Y_hat = (1 - pi_hat) * mu_hat
Y - Y_hat
}
compute.log.naive.residuals <- function(Y, zinb){
mu = t(getMu(zinb))
pi = t(getPi(zinb))
phi = matrix(rep(getPhi(zinb), ncol(Y)), ncol = ncol(Y))
var_hat = (1 - pi) * mu * (1 + mu * (phi + pi))
Y_hat = (1 - pi) * mu
log_Y_hat_plus_1 = log(1 + Y_hat) - var_hat / (2*(1 + Y_hat)^2)
log(Y + 1) - log_Y_hat_plus_1
}
compute.pearson.residuals <- function(Y, zinb){
num = compute.naive.residuals(Y, zinb)
mu = t(getMu(zinb))
pi = t(getPi(zinb))
phi = matrix(rep(getPhi(zinb), ncol(Y)), ncol = ncol(Y))
var_hat = (1 - pi) * mu * (1 + mu * (phi + pi))
num / sqrt(var_hat)
}
compute.zinb.loglik <- function(Y, zinb){
mu = t(getMu(zinb))
theta = getTheta(zinb)
theta_mat = matrix(rep(theta, ncol(Y), ncol = ncol(Y)))
pi = t(getPi(zinb))
log( pi * (Y == 0) + (1 - pi) * dnbinom(Y, size = theta, mu = mu) )
}
compute.deviance.residuals <- function(Y, zinb){
mu_hat = t(getMu(zinb))
pi_hat = t(getPi(zinb))
Y_hat = (1 - pi_hat) * mu_hat
ll = compute.zinb.loglik(Y, zinb)
sign = 1*(Y - Y_hat > 0)
sign[sign == 0] = -1
sign * sqrt(-2 * ll)
}
Let’s run zinbwave with K = 0 and X with an intercept only (no batch).
print(system.time(zinb <- zinbFit(core, ncores = NCORES, K = 0)))
## user system elapsed
## 164.974 43.945 75.115
Rn = compute.naive.residuals(core, zinb)
Rn[1:3,1:3]
## OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## Gstm1 3849.11026 2338.18336 -308.9602
## Cbr2 3326.65478 1193.04910 -490.0451
## Cyp2f2 36.88932 -74.65741 -591.2817
PCA (centered not scale) of residuals, first colored by batch (left), then by clusters (right). On the left side, we should not see patterns whereas on the right hand side we should.
pca = prcomp(t(Rn))
par(mfrow = c(1,2))
plot(pca$x, col = batch, pch = 19,
main="PCA of residuals\ncolor=batch")
plot(pca$x, col = clus.labels, pch = 19,
main = "PCA of residuals\ncolor=cluster")
par(mfrow = c(1,1))
de_1000 = de[de %in% rownames(Rn)]
length(de_1000)
## [1] 31
head(de_1000)
## [1] "Ebf1" "Ccnd2" "Krt14" "Trp63" "Sox9" "Kitl"
Each boxplot is for a cell. Colors correspond to the batches. When colors are the same but boxplots are not next to each others it corresponds to different batches. We would expect that the residuals look similar for the different batches.
Rn_order = Rn[, order(as.numeric(batch))]
col_order = as.numeric(batch)[order(as.numeric(batch))]
plotBoxplot(Rn_order, main = 'Boxplot of residuals\ncolor=batch',
col = col_order)
breaks = setBreaks(Rn[de_1000, ], breaks = 0.99)
heatmap.2(Rn[de_1000, ], breaks = breaks,
col = colorRampPalette(c('blue', 'yellow'))(51),
scale = 'none', trace = 'none',
ColSideColors = as.character(clus.labels),
labCol = '', main = 'Naive Residuals')
Rn = compute.log.naive.residuals(core, zinb)
Rn[1:3,1:3]
## OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## Gstm1 2.256998 2.071413 1.599005
## Cbr2 2.575342 2.132643 1.456709
## Cyp2f2 1.721755 1.676310 1.283519
pca = prcomp(t(Rn))
par(mfrow = c(1,2))
plot(pca$x, col = batch, pch = 19,
main="PCA of residuals\ncolor=batch")
plot(pca$x, col = clus.labels, pch = 19,
main = "PCA of residuals\ncolor=cluster")
par(mfrow = c(1,1))
Rn_order = Rn[, order(as.numeric(batch))]
col_order = as.numeric(batch)[order(as.numeric(batch))]
plotBoxplot(Rn_order, main = 'Boxplot of residuals\ncolor=batch',
col = col_order)
heatmap.2(Rn[de_1000, ],
col = colorRampPalette(c('blue', 'yellow'))(51),
scale = 'none', trace = 'none',
ColSideColors = as.character(clus.labels),
labCol = '', main = 'Naive Residuals')
Rp = compute.pearson.residuals(core, zinb)
Rp[1:3,1:3]
## OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## Gstm1 0.427421118 0.26116965 -0.04329637
## Cbr2 0.724533374 0.26181816 -0.13516322
## Cyp2f2 0.009414916 -0.01918576 -0.19083820
pca = prcomp(t(Rp))
par(mfrow = c(1,2))
plot(pca$x, col = batch, pch = 19,
main="PCA of residuals\ncolor=batch")
plot(pca$x, col = clus.labels, pch = 19,
main = "PCA of residuals\ncolor=cluster")
par(mfrow = c(1,1))
Rp_order = Rp[, order(as.numeric(batch))]
col_order = as.numeric(batch)[order(as.numeric(batch))]
plotBoxplot(Rp_order, main = 'Boxplot of residuals\ncolor=batch',
col = col_order)
heatmap.2(Rp[de_1000, ],
col = colorRampPalette(c('blue', 'yellow'))(51),
scale = 'none', trace = 'none',
ColSideColors = as.character(clus.labels),
labCol = '', main = 'Pearson residuals')
Rd = compute.deviance.residuals(core, zinb)
Rd[1:3,1:3]
## OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## Gstm1 4.669326 4.620308 -4.460612
## Cbr2 4.601415 4.474531 -4.263009
## Cyp2f2 4.347795 -4.334302 -4.193403
pca = prcomp(t(Rd))
par(mfrow = c(1,2))
plot(pca$x, col = batch, pch = 19,
main="PCA of residuals\ncolor=batch")
plot(pca$x, col = clus.labels, pch = 19,
main = "PCA of residuals\ncolor=cluster")
par(mfrow = c(1,1))
Rd_order = Rd[, order(as.numeric(batch))]
col_order = as.numeric(batch)[order(as.numeric(batch))]
plotBoxplot(Rd_order, main = 'Boxplot of residuals\ncolor=batch',
col = col_order)
heatmap.2(Rd[de_1000, ],
col = colorRampPalette(c('blue', 'yellow'))(51),
scale = 'none', trace = 'none',
ColSideColors = as.character(clus.labels),
labCol = '', main = 'Deviance residuals')
Let’s run zinbwave with K = 0 and X with an intercept only (no batch).
mod = model.matrix( ~ batch)
print(system.time(zinb_batch <- zinbFit(core, ncores = NCORES, K = 0,
X = mod)))
## user system elapsed
## 298.001 59.765 112.251
Rn = compute.naive.residuals(core, zinb_batch)
Rn[1:3,1:3]
## OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## Gstm1 2384.4457 610.1331 -1611.654
## Cbr2 1994.4479 -296.8525 -1636.557
## Cyp2f2 664.3703 482.0526 -133.216
pca = prcomp(t(Rn))
par(mfrow = c(1,2))
plot(pca$x, col = batch, pch = 19,
main="PCA of residuals\ncolor=batch")
plot(pca$x, col = clus.labels, pch = 19,
main = "PCA of residuals\ncolor=cluster")
par(mfrow = c(1,1))
Rn_order = Rn[, order(as.numeric(batch))]
col_order = as.numeric(batch)[order(as.numeric(batch))]
plotBoxplot(Rn_order, main = 'Boxplot of residuals\ncolor=batch',
col = col_order)
breaks = setBreaks(Rn[de_1000, ], breaks = 0.99)
heatmap.2(Rn[de_1000, ], breaks = breaks,
col = colorRampPalette(c('blue', 'yellow'))(51),
scale = 'none', trace = 'none',
ColSideColors = as.character(clus.labels),
labCol = '', main = 'Naive residuals')
Rn = compute.log.naive.residuals(core, zinb)
Rn[1:3,1:3]
## OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## Gstm1 2.256998 2.071413 1.599005
## Cbr2 2.575342 2.132643 1.456709
## Cyp2f2 1.721755 1.676310 1.283519
pca = prcomp(t(Rn))
par(mfrow = c(1,2))
plot(pca$x, col = batch, pch = 19,
main="PCA of residuals\ncolor=batch")
plot(pca$x, col = clus.labels, pch = 19,
main = "PCA of residuals\ncolor=cluster")
par(mfrow = c(1,1))
Rn_order = Rn[, order(as.numeric(batch))]
col_order = as.numeric(batch)[order(as.numeric(batch))]
plotBoxplot(Rn_order, main = 'Boxplot of residuals\ncolor=batch',
col = col_order)
heatmap.2(Rn[de_1000, ],
col = colorRampPalette(c('blue', 'yellow'))(51),
scale = 'none', trace = 'none',
ColSideColors = as.character(clus.labels),
labCol = '', main = 'Naive residuals')
Rp = compute.pearson.residuals(core, zinb_batch)
Rp[1:3,1:3]
## OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## Gstm1 0.2207362 0.05449718 -0.18326782
## Cbr2 0.3078798 -0.04424473 -0.31074163
## Cyp2f2 0.2612865 0.18304143 -0.06443715
pca = prcomp(t(Rp))
par(mfrow = c(1,2))
plot(pca$x, col = batch, pch = 19,
main="PCA of residuals\ncolor=batch")
plot(pca$x, col = clus.labels, pch = 19,
main = "PCA of residuals\ncolor=cluster")
par(mfrow = c(1,1))
Rp_order = Rp[, order(as.numeric(batch))]
col_order = as.numeric(batch)[order(as.numeric(batch))]
plotBoxplot(Rp_order, main = 'Boxplot of residuals\ncolor=batch',
col = col_order)
heatmap.2(Rp[de_1000, ],
col = colorRampPalette(c('blue', 'yellow'))(51),
scale = 'none', trace = 'none',
ColSideColors = as.character(clus.labels),
labCol = '', main = 'Pearson residuals')
Rd = compute.deviance.residuals(core, zinb_batch)
Rd[1:3,1:3]
## OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## Gstm1 4.637339 4.591452 -4.440573
## Cbr2 4.552446 -4.441892 -4.248676
## Cyp2f2 4.327764 4.310825 -4.159757
pca = prcomp(t(Rd))
par(mfrow = c(1,2))
plot(pca$x, col = batch, pch = 19,
main="PCA of residuals\ncolor=batch")
plot(pca$x, col = clus.labels, pch = 19,
main = "PCA of residuals\ncolor=cluster")
par(mfrow = c(1,1))
Rd_order = Rd[, order(as.numeric(batch))]
col_order = as.numeric(batch)[order(as.numeric(batch))]
plotBoxplot(Rd_order, main = 'Boxplot of residuals\ncolor=batch',
col = col_order)
heatmap.2(Rd[de_1000, ],
col = colorRampPalette(c('blue', 'yellow'))(51),
scale = 'none', trace = 'none',
ColSideColors = as.character(clus.labels),
labCol = '', main = 'Deviance residuals')
As batches are somehow confounded with the biology, I don’t know if we should adjust or not for the batches. For the residuals, it seems that the Pearson residuals when not adjusting for batches is the most informative (at least, when looking at PC 1 and 2).
sessionInfo()
## R version 3.4.0 (2017-04-21)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Sierra 10.12.4
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] clusterExperiment_1.2.0 SummarizedExperiment_1.6.1
## [3] DelayedArray_0.2.4 Biobase_2.36.2
## [5] GenomicRanges_1.28.3 GenomeInfoDb_1.12.0
## [7] IRanges_2.10.2 S4Vectors_0.14.2
## [9] BiocGenerics_0.22.0 gplots_3.0.1
## [11] matrixStats_0.52.2 zinbwave_0.1.4
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-131 bitops_1.0-6
## [3] bold_0.4.0 progress_1.1.2
## [5] httr_1.2.1 doParallel_1.0.10
## [7] RColorBrewer_1.1-2 rprojroot_1.2
## [9] prabclus_2.2-6 numDeriv_2016.8-1
## [11] tools_3.4.0 backports_1.1.0
## [13] R6_2.2.1 KernSmooth_2.23-15
## [15] DBI_0.6-1 lazyeval_0.2.0
## [17] colorspace_1.3-2 ade4_1.7-6
## [19] trimcluster_0.1-2 nnet_7.3-12
## [21] prettyunits_1.0.2 gridExtra_2.2.1
## [23] compiler_3.4.0 glmnet_2.0-10
## [25] pspline_1.0-17 xml2_1.1.1
## [27] pkgmaker_0.22 diptest_0.75-7
## [29] caTools_1.17.1 scales_0.4.1
## [31] DEoptimR_1.0-8 mvtnorm_1.0-6
## [33] robustbase_0.92-7 NMF_0.20.6
## [35] stringr_1.2.0 digest_0.6.12
## [37] rmarkdown_1.5 XVector_0.16.0
## [39] htmltools_0.3.6 stabledist_0.7-1
## [41] ADGofTest_0.3 limma_3.32.2
## [43] rlang_0.1.1 howmany_0.3-1
## [45] jsonlite_1.4 mclust_5.3
## [47] gtools_3.5.0 dplyr_0.5.0
## [49] dendextend_1.5.2 RCurl_1.95-4.8
## [51] magrittr_1.5 modeltools_0.2-21
## [53] GenomeInfoDbData_0.99.0 Matrix_1.2-10
## [55] Rcpp_0.12.11 munsell_0.4.3
## [57] ape_4.1 abind_1.4-5
## [59] viridis_0.4.0 stringi_1.1.5
## [61] whisker_0.3-2 yaml_2.1.14
## [63] MASS_7.3-47 zlibbioc_1.22.0
## [65] flexmix_2.3-14 MAST_1.2.1
## [67] plyr_1.8.4 grid_3.4.0
## [69] gdata_2.17.0 rncl_0.8.2
## [71] lattice_0.20-35 splines_3.4.0
## [73] knitr_1.16 uuid_0.1-2
## [75] taxize_0.8.4 fpc_2.1-10
## [77] rngtools_1.2.4 softImpute_1.4
## [79] reshape2_1.4.2 codetools_0.2-15
## [81] XML_3.98-1.7 evaluate_0.10
## [83] RNeXML_2.0.7 data.table_1.10.4
## [85] foreach_1.4.3 locfdr_1.1-8
## [87] tidyr_0.6.3 gtable_0.2.0
## [89] reshape_0.8.6 assertthat_0.2.0
## [91] kernlab_0.9-25 ggplot2_2.2.1
## [93] gridBase_0.4-7 phylobase_0.8.4
## [95] xtable_1.8-2 class_7.3-14
## [97] pcaPP_1.9-61 viridisLite_0.2.0
## [99] gsl_1.9-10.3 tibble_1.3.1
## [101] copula_0.999-16 iterators_1.0.8
## [103] registry_0.3 cluster_2.0.6